################################################################################
################################################################################
####### THIS SCRIPT CREATES A CONTROL VARIABLE MEASURING OPPOSITION IN THE EP ###
############## Mostly data wrangling with votewatch data #####################


library(haven)
library(readxl)
library(tidyverse)
library(lme4)
library(texreg)
library(sjPlot)
library(sjstats)
library(fixest)
library(ggplot2)
library(ggeffects)


## LOAD ROLL CALL 6 
RC_6 <- read_xlsx("Data/EP6_RCVs.xlsx")
RC_6 <- RC_6[-c(1:2),]
RC_6 <- RC_6[,c(1, 4, 111)]#111 is vote number 101, support for barosso commission
colnames(RC_6) <- c("mep_ids", "name", "Barr1")

#### LOAD ROLL CALL 7 
#### MISSING MEP IDS

RC_7 <- read_xlsx("Data/EP7_RCVs.xlsx")
RC_7 <- RC_7[,c(1,3, 4, 329)]#329 is vote number 320, support for barosso commission 2
colnames(RC_7) <- c("wrong_ID", "Lname", "name", "Barr2")

## LOAD ROLL CALL 8
RC_8 <- read_xlsx("Data/EP8_RCVs.xlsx")
RC_8 <- unite(RC_8, col = "name", c("Lname", "Fname"), sep = ", ", remove = FALSE)
RC_8 <- RC_8[,c(1, 2, 4, 97)]#97 is vote number 87, support for Juncker commission
colnames(RC_8) <- c("right_ids", "name", "Lname", "Juncker")

#### FIXING IDS OF THOSE FROM 7

#take out those with same family names
RC_7$unique1 <- duplicated(RC_7$Lname)
RC_7$unique2 <- duplicated(RC_7$Lname, fromLast = T)

RC_7_auto <- subset(RC_7, RC_7$unique1 == "FALSE" & RC_7$unique2 == "FALSE")#merge automatically
RC_7_auto <- RC_7_auto[,-c(5:6)]

RC_7_manu <- subset(RC_7, RC_7$unique1 == "TRUE" | RC_7$unique2 == "TRUE") #merge manually
RC_7_manu <- RC_7_manu[,-c(5:6)]

### get reelected
RC_67 <- left_join(RC_7_auto, RC_6, by = "name")### take all that got reelected
# get those who will be reelected
RC_678 <- left_join(RC_67, RC_8, by ="name")

### FOR THE REMAINDER
### take real mep ids
members <- read.csv2("Data/Members.csv", encoding = "UTF-8")
members <- members[,c(1:2)]
members <- distinct(members)
colnames(members) <- c("mep_ids", "name")
pattern <- "(?<![\\p{L}])[\\p{Lu}À-ÖØ-Þ]+(?:[\\s-][\\p{Lu}À-ÖØ-Þ]+)*(?:\\p{Lu}(?!\\p{L})|[\\p{Lu}À-ÖØ-Þ])?(?![\\p{L}])"

matches <- regmatches(members$name, gregexpr(pattern, members$name, perl = TRUE))
matches <- lapply(matches, paste, collapse = " ")
members$Lname <- unlist(matches)

members$unique1 <- duplicated(members$Lname)
members$unique2 <- duplicated(members$Lname, fromLast = T)
members_auto <- subset(members, members$unique1 == "FALSE" & members$unique2 == "FALSE")#merge automatically
members_manu <- subset(members, members$unique1 == "TRUE" | members$unique2 == "TRUE") #merge manually


colnames(members_auto)[1] <- "real_id" 
members_auto <- members_auto[,-c(4:5)]


RC_7_tofix1 <- subset(RC_678, is.na(RC_678$mep_ids) & is.na(RC_678$right_ids))
RC_7_tofix1 <- RC_7_tofix1[1:4]
colnames(RC_7_tofix1)[2] <- "Lname"

RC_7_auto_merged <- left_join(RC_7_tofix1, members_auto, by ="Lname") 

#### AUTOMATED MERGING SUCCESSFUL
merged_success <- subset(RC_7_auto_merged, !is.na(RC_7_auto_merged$real_id))### THESE ONES ARE DONE

###### CODE THESE BELOW AND THOSE WITH NON-UNIQUE NAMES
still_missing <- subset(RC_7_auto_merged, is.na(RC_7_auto_merged$real_id))

still_missing$real_id <- c("96750",
                           "28246",
                           "96954",
                           "28342",   
                           "1759", # BAUDIS
                           "97156",
                           "96856",   
                           "96973",
                           "96914",
                           "97129", #de MAGISTRIS
                           "96676",
                           "96729",
                           "23868",
                           "2125",
                           "96991", #gardiazabal
                           "96751",
                           "96734",
                           "96910",
                           "97196",
                           "96905",
                           "36281", # ticau
                           "97203",
                           "96898",
                           "2212",
                           "107041", # becker
                           "97344",
                           "107977",
                           "109649",
                           "110365",
                           "111018",
                           "119439")

#### now fix those with non unique names

RC_7_manu$real_id <- c("28308",
                       "96706",
                       "28117",
                       "96669",
                       "96667",
                       "96668",
                       "1909",
                       "38398", #lange2
                       "1023",
                       "28210",
                       "1403",
                       "4238",
                       "1911",
                       "96731", # werner schulz
                       "34254",
                       "96863",
                       "96678",
                       "28134",
                       "2319",
                       "28229",
                       "39713",
                       "39725",
                       "96764",
                       "102931",
                       "112620")

#### PUT ALL FOUR TOGETHER

merged_success <- merged_success[,c(1,3,4,5)]

colnames(merged_success) <- c("wrong_ID", "name", "Barr2", "final_id")

RC_7_fixed <- subset(RC_678, !is.na(RC_678$mep_ids) | !is.na(RC_678$right_ids))
RC_7_fixed$final_id <- ifelse(is.na(RC_7_fixed$mep_ids), RC_7_fixed$right_ids, RC_7_fixed$mep_ids)
anyNA(RC_7_fixed$final_id)
RC_7_fixed <- RC_7_fixed[,c(1,3,4,10)]


still_missing <- still_missing[,c(1,3,4,5)]
colnames(still_missing)[4] <- "final_id"
colnames(still_missing)[2] <- "name"

RC_7_manu <- RC_7_manu[,c(1,3,4,5)]
colnames(RC_7_manu)[4] <- "final_id"

### fix three who got assigend wrong ids
fixed_RC7 <- rbind(RC_7_fixed, merged_success, RC_7_manu, still_missing)
fixed_RC7$final_id <- ifelse(fixed_RC7$name == "JENSEN, Anne E.", "96936", fixed_RC7$final_id) #fix ANNE JENSEN
fixed_RC7$final_id <- ifelse(fixed_RC7$name == "MATO ADROVER, Gabriel", "4440", fixed_RC7$final_id) # FIX MATO

fixed_RC7$Barr2 <- ifelse(fixed_RC7$name == "JENSEN, Anne E.", 1, fixed_RC7$Barr2) #fix ANNE JENSEN
fixed_RC7$Barr2 <- ifelse(fixed_RC7$name == "MATO ADROVER, Gabriel", 1, fixed_RC7$Barr2) # FIX MATO

fixed_RC7$final_id <- ifelse(fixed_RC7$name == "GALLO, Marielle", "96896", fixed_RC7$final_id) #fix GALLO
fixed_RC7$Barr2 <- ifelse(fixed_RC7$name == "MATO ADROVER, Gabriel", 1, fixed_RC7$Barr2) # FIX GALLO

nrow(fixed_RC7) - nrow(RC_7)
#####
fixed_RC7 <- fixed_RC7[,-1]
colnames(fixed_RC7)[3] <- "mep_ids"


### create coa variable 
RC_6$EP_Coa <- ifelse(RC_6$Barr1 == "1", 0, 1)# 0 is europhile core, 1 is opposition
fixed_RC7$EP_Coa <- ifelse(fixed_RC7$Barr2 == "1", 0, 1)# 0 is europhile core, 1 is opposition
RC_8$EP_Coa <- ifelse(RC_8$Juncker == "1", 0, 1)# 

RC_6$Barr1 <- NULL
RC_6$name <- NULL

fixed_RC7$Barr2 <- NULL
fixed_RC7$name <- NULL

RC_8$Juncker <- NULL
RC_8$name <- NULL

colnames(RC_8) <- c("mep_ids", "Lname", "EP_Coa")
RC_8$Lname <- NULL
RC_8$mep_ids <- as.character(RC_8$mep_ids)

rm(list=setdiff(ls(), c("RC_6", "fixed_RC7", "RC_8")))



#####

load("Data/EP_debates_11032023.Rdata")

levels(EP_debates$EU_party)
levels(EP_debates$EU_party) <- c("NGL", "EFD/EFDD", "ECR",
                                 "EDD", "ALDE",  "EPP",
                                 "Greens","S&D", "ID",
                                 "No Group", "UEN", "EFD/EFDD", 
                                 "NGL", "Greens", "No Group")
levels(EP_debates$EU_party)


### SUBSET EP DATA FOR CONVENIENCE
EP_9904 <- subset(EP_debates, EP_debates$date < "2004-06-01")
EP_0409 <- subset(EP_debates, EP_debates$date > "2004-06-01" & EP_debates$date < "2009-06-04")
EP_0914 <- subset(EP_debates, EP_debates$date > "2009-06-04" & EP_debates$date < "2014-05-22")
EP_1419 <- subset(EP_debates, EP_debates$date >= "2014-05-22")


EP_9904$EP_Coa <- ifelse(EP_9904$EU_party == "EPP" | EP_9904$EU_party == "ALDE" | EP_9904$EU_party == "S&D", 0,1 )# 0 is europhile core, 1 is opposition
EP_0409 <- left_join(EP_0409, RC_6, by = "mep_ids")
EP_0914 <- left_join(EP_0914, fixed_RC7, by = "mep_ids")
EP_1419 <- left_join(EP_1419, RC_8, by = "mep_ids")

EP_debates <- rbind(EP_9904, EP_0409, EP_0914, EP_1419)
##### 

save(EP_debates, file = "Data/EP_debates_11032023.Rdata")


# checked and last run on 12.03.2023